#author: Danielle Remmerswaal
#last update: 18-01-2024



# 3. Data quantity and quality ------------


# 3.1 Data preparation -------
# Load the per-day overview and the labelled classified periods.
combineddays <- read.csv(file = "Data/combineddays.csv") #userid, brand, apptime, location sensors, stages, confirmed day. PER DAY
classperiods <- read.csv(file = "Data/ClassifiedPeriodslabelled.csv")
## ClassTimes: one row per classified period, extended with per-user study timing
## and with per-user-per-day counts of periods, deletions, labels and self-made periods.
## NOTE(review): start_date, end_date and created_on are divided by 1000 before being
## treated as POSIXct seconds, so they are presumably epoch milliseconds -- confirm.
## Requires dplyr (%>%, mutate, group_by, ...) and lubridate (as_date, wday) to be attached.
ClassTimes <- classperiods%>%
  mutate(durationperiod_H = (end_date-start_date)/(1000*60*60),  # period length in hours (given millisecond timestamps)
  ) %>% 
  group_by(user_id)%>%
  arrange(start_date) %>%   # rows sorted by start_date, so [1] / [length()] below pick each user's first / last period
  mutate( timestamp_createdon = structure(round(created_on/1000), 
                                          class = c("POSIXct", "POSIXt"), 
                                          tzone = "Europe/Amsterdam"), 
          timestamp_endofperiod = structure(round(end_date/1000), 
                                            class = c("POSIXct", "POSIXt"), 
                                            tzone = "Europe/Amsterdam"), 
          timestamp_startofperiod = structure(round(start_date/1000), 
                                              class = c("POSIXct", "POSIXt"), 
                                              tzone = "Europe/Amsterdam"),
          # fractional number of days between the user's first period start and the end of
          # the last period (last in start_date order)
          duration_study_user_komma = (end_date[length(end_date)]-start_date[1])/(1000*60*60*24),
          date = as_date(timestamp_startofperiod)   # calendar date on which the period starts
  ) %>% 
  arrange(timestamp_startofperiod)%>%
  mutate(
    # 1 when the period was created on the calendar day on which it ends or starts, else 0
    createdonperiodday = ifelse(test =  as_date(timestamp_createdon)== as_date(timestamp_endofperiod) , yes = 1, 
                                no =  ifelse(as_date(timestamp_createdon)== as_date(timestamp_startofperiod), yes = 1 , no= 0 ) ),
    
    start_time_user = timestamp_startofperiod[1],    # first period of the user; relies on the data being arranged by start time
    start_day_user = as_date(start_time_user), 
    start_weekday_user = wday(start_time_user), 
    end_time_user = timestamp_endofperiod[length(timestamp_endofperiod)],  # end timestamp of the user's last period
    duration_period = as_date(timestamp_endofperiod)-as_date(timestamp_startofperiod),   # whole calendar days spanned by the period
    duration_study_user = as_date(end_time_user)-as_date(start_time_user),  # whole calendar days from first to last classified period
    # duration_study_user_komma = (end_time_user)-(start_time_user),  #1st to last classified period
    StudyDay = as.numeric(as_date(timestamp_startofperiod)-as_date(start_day_user)),  # day offset of the period; 0 = day of the user's first period
    # 1 when this period's uuid occurs as `origin` of one of the same user's rows with origin_cat "edit"
    edited = ifelse(test = uuid %in% origin[origin_cat=="edit"] , yes= 1, no=0), 
    edited_remain = ifelse(test = edited ==1 & deleted_on==0, 1, 0)      # edited and not deleted. TODO (original author): add time editing when start time is not the same
  ) %>% 
  #filter(durationperiod_H>(1/60) & durationperiod_H<24 ) %>%  #filter unrealistic periods with <1 minutes or >24 hours
  distinct() %>% 
  group_by(user_id, date) %>% # daily overview per user and calendar day (original note: the first hours of data on the day of registration are excluded here)
  # All counts below are per user and per day; deleted_on == 0 selects periods that were not deleted.
  # NOTE(review): "stops" are counted through a non-missing vehicle_id and "moves" through a
  # non-missing reason_id -- confirm that this mapping is intended and not swapped.
  mutate(periods_perday_withDels= length(user_id),
         periods_perday_noDels = length(user_id[deleted_on==0]),
         periods_perday_edited = length(user_id[edited==1]),
         periods_perday_stops = length(user_id[deleted_on==0 & !is.na(vehicle_id)]),
         periods_perday_moves = length(user_id[deleted_on==0 & !is.na(reason_id)]),
         #perc_dels_daily = (periods_perday_withDels-periods_perday_noDels-periods_perday_edited)/periods_perday_withDels*100,
         perc_dels_daily = (periods_perday_withDels-periods_perday_noDels)/periods_perday_withDels*100,   # % of the day's periods that were deleted
         #mean(deleted_on>0)
         unlabeled_perday = sum(confirmed=='f'& deleted_on==0),   # labelled periods can also be removed, hence the deleted_on filter
         labels_perday = sum(confirmed=='t'& deleted_on==0),   # confirmed, non-deleted periods
         labels_perday_stops = sum(vehicle_id>0 & deleted_on==0, na.rm = T),   # non-deleted periods with a positive vehicle_id
         labels_perday_moves = sum(reason_id>0 & deleted_on==0, na.rm = T),   # non-deleted periods with a positive reason_id
         perc_labeled = labels_perday/periods_perday_noDels*100,
         # the denominators below repeat the definitions of periods_perday_stops / periods_perday_moves
         perc_labeled_stops = labels_perday_stops/length(user_id[deleted_on==0 & !is.na(vehicle_id)])*100,
         perc_labeled_moves = labels_perday_moves/length(user_id[deleted_on==0 & !is.na(reason_id)])*100,
         selfmadeperiods = length(uuid[deleted_on==0 & origin_cat =='respondent_new']),   # non-deleted periods with origin_cat 'respondent_new'
         perc_selfmade = selfmadeperiods/periods_perday_noDels*100,
  )
# Cache of the ClassTimes table computed above; the cached copy is what is used from here on.
#write.csv(ClassTimes, file = "Data/ClassTimes.csv")
ClassTimes <- read.csv("Data/ClassTimes.csv") %>%
  rename(confirmed_period = confirmed)

# Phase / variant information per user
faseinfo23 <- read.csv("Data/faseinfo.csv") %>%
  filter(user_id != "") %>%
  unique()
faseInfo2 <- read.csv("Data/faseinfo.csv") %>%
  dplyr::select(user_id, VariantCode, studylength, fase, brand) %>%
  distinct()

# Per-day inputs: confirmed days, sensor summary and app usage
TrackedDays <- read.csv("Data/TrackedDays.csv") %>%
  dplyr::select(confirmed:appsentiment) %>%
  rename(confirmed_day = confirmed)
sensorsperday <- read.csv("../data/02_summary_per_userday.csv") %>%
  rename(date = day)
DailyAppuse <- read.csv("../Respondentacties/DanielleR/DailyAppUse.csv") %>%
  dplyr::select(-X) %>%
  rename(date = dag_melding)

# Full outer join of sensor days and app-use days, then add the phase information
# (second merge joins on all columns the two tables have in common).
combineddays <- merge(
  sensorsperday, DailyAppuse,
  by = c("user_id", "date"), all = TRUE
) %>%
  merge(faseInfo2)
# Add the tracked-day information, keeping every row of combineddays
combineddays2 <- merge(
  combineddays, TrackedDays,
  by = c("user_id", "date"), all.x = TRUE
)
# Full outer join of periods and days; `date` is the date of the period here
OverviewDays <- merge(
  ClassTimes, combineddays2,
  by = c("user_id", "date", "fase", "studylength", "VariantCode"),
  all = TRUE
)

# pre_q: 1 when the user occurs in the questionnaire export, 0 otherwise
questionnaire <- read.csv("../../Export tabellen 24-02-23/questionnaires_parsed_dm.csv", sep = ",") #%>% select(id[1], day:user_id)
OverviewDays$pre_q <- as.numeric(OverviewDays$user_id %in% questionnaire$user_id)

# Cache of the merged table; the cached copy is what is used from here on.
#write.csv(OverviewDays, file = "Data/OverviewDaysSept.csv")
OverviewDays <- read.csv("Data/OverviewDaysSept.csv")


#3.2 summarize per day ------------

# One row per user and day: keep only the day-level columns, drop the
# period-level duplicates, number the study days and set missing values to 0.
OverviewDays_daily <- OverviewDays %>%
  group_by(user_id, studylength, fase, VariantCode) %>%
  dplyr::select(
    user_id, studylength, fase, VariantCode, pre_q,
    date,
    periods_perday_withDels, periods_perday_noDels, perc_dels_daily,
    periods_perday_stops, periods_perday_moves,
    labels_perday_moves, labels_perday_stops,
    labels_perday, perc_labeled, selfmadeperiods,
    perc_labeled_moves, perc_labeled_stops,
    perc_selfmade,
    messages_perday_tot, Nsessions_perday, Apptime_perday_sec,
    n_observations, n_hours, n_minutes
  ) %>%
  distinct() %>%
  arrange(user_id, date) %>%
  # flag days with at least 12 hours of data
  mutate(twelveH = as.numeric(n_hours >= 12)) %>%
  group_by(user_id) %>%
  # study day as shown in the participant's app: the user's first date is day 1
  mutate(
    Studiedag = as.numeric(lubridate::ymd(date) - min(lubridate::ymd(date))) + 1
  ) %>%
  # same column order as before: Studiedag directly after date, twelveH last
  relocate(Studiedag, .after = date) %>%
  distinct() %>%
  # replace every missing value (in all non-grouping columns) by 0
  mutate(across(everything(), ~ replace(.x, is.na(.x), 0)))

#write.csv(OverviewDays_daily, file = "Data/OverviewDaysDailyOKT.csv")

# Phases after the first one, restricted to study days 1-7, with per-user totals
# and per-day indicator columns.
OverviewDays_daily23 <- OverviewDays_daily %>%
  filter(fase > 1, Studiedag < 8) %>%
  group_by(user_id) %>%
  mutate(
    # per-user totals over the retained days
    SUMobservations = sum(n_observations, na.rm = TRUE),
    TOT_hours = sum(n_hours, na.rm = TRUE),
    TOT_diarydays = sum(n_hours >= 1, na.rm = TRUE),
    TOT_dayswselfmade = sum(selfmadeperiods >= 1, na.rm = TRUE),
    TOT_dayswlabels = sum(labels_perday >= 1, na.rm = TRUE),
    # per-day indicators
    labeled = as.numeric(labels_perday > 0),
    selfmade = as.numeric(selfmadeperiods > 0),
    # stage 2: the user has at least one location observation
    stage2 = as.numeric(SUMobservations > 0),
    # "complete" day: a sign of life (observations or messages) on a day after
    # the first study day (1-day group) or on study day 7 (7-day group)
    complete = case_when(
      studylength == 1 & Studiedag > 1 &
        (n_observations > 0 | messages_perday_tot > 0) ~ "1",
      studylength == 7 & Studiedag == 7 &
        (n_observations > 0 | messages_perday_tot > 0) ~ "1",
      TRUE ~ "0"
    )
  )

# Stage membership per user
# Number of users that reached stage 2 (at least one location observation)
length(unique(OverviewDays_daily23$user_id[OverviewDays_daily23$stage2 == 1]))

OverviewDays_STAGE2 <- OverviewDays_daily23 %>% filter(stage2 == 1)
idsstage1 <- unique(OverviewDays_daily23$user_id)
idsstage2 <- unique(OverviewDays_STAGE2$user_id)
# Stage 3: stage-2 users with at least one "complete" day. The ids must be taken
# from the same table the row positions were computed on (OverviewDays_STAGE2);
# indexing OverviewDays_daily23 with them picks the wrong users as soon as
# any user was dropped by the stage-2 filter.
idsstage3 <- unique(
  OverviewDays_STAGE2$user_id[which(OverviewDays_STAGE2$complete == "1")]
)
OverviewDays_STAGE3 <- OverviewDays_STAGE2 %>% filter(user_id %in% idsstage3)
length(unique(OverviewDays_STAGE2$user_id))
length(unique(OverviewDays_STAGE3$user_id))

#faseInfo23 <- read.csv(file = "Data/faseinfo.csv") %>% filter(user_id!="") %>%  filter(fase>1) %>% dplyr::select(-X) %>% distinct() %>%
# mutate(
#    Stage1 = ifelse(user_id %in% idsstage1, 1, 0),
#    Stage2 = ifelse(user_id %in% idsstage2, 1, 0),
#    Stage3 = ifelse(user_id %in% idsstage3, 1, 0) )
#write.csv(faseInfo23, file = "Data/faseinfo2.csv")


#3.3 results ------------------
#Data quality STAGE 1: registered in app --------
#      not applicable
#Data quality STAGE 2: have any location measurements -------------
# Restrict to users with location measurements on study day 2 (first full day)
OverviewDays_STAGE2dag2 <- OverviewDays_STAGE2 %>%
  filter(Studiedag == 2, n_observations > 0)
idsstage2day2 <- unique(OverviewDays_STAGE2dag2$user_id)
OverviewDays_STAGE3dag2 <- OverviewDays_STAGE3 %>%
  filter(Studiedag == 2, n_observations > 0)
idsstage3day2 <- unique(OverviewDays_STAGE3dag2$user_id)

# Summary per study length over all days after the first study day, for users
# with location measurements on study day 2.
compleetweek <- OverviewDays_STAGE2 %>%
  filter(Studiedag > 1, user_id %in% idsstage2day2) %>%
  group_by(studylength) %>%
  summarise(
    # numbers of users
    N_participants = n_distinct(user_id),
    Nwithobservations = n_distinct(user_id[n_observations > 0]),
    Nwithperiods = n_distinct(user_id[periods_perday_withDels > 0]),
    N_fase2 = n_distinct(user_id[fase == 2]),
    N_fase2periods = n_distinct(user_id[fase == 2 & periods_perday_withDels > 0]),
    # totals over user-days
    TOTALhoursdata = sum(n_hours, na.rm = TRUE),
    TOTALminutesdata = sum(n_minutes, na.rm = TRUE),
    TOTALdiarydays = sum(n_hours >= 1, na.rm = TRUE),
    TOTALdiarydays_phase2 = sum(n_hours >= 1 & fase == 2, na.rm = TRUE),
    TOTALperioddays = sum(periods_perday_withDels >= 1, na.rm = TRUE),
    TOTALlabeldays = sum(labels_perday >= 1, na.rm = TRUE),
    MEANhoursdiaryday = TOTALhoursdata / TOTALdiarydays,
    MEANminutesdiaryday = TOTALminutesdata / TOTALdiarydays,
    TOTALdayswselfmade = sum(selfmadeperiods >= 1, na.rm = TRUE),
    MEANdayspp = TOTALdiarydays / Nwithobservations,
    Nwithlabels = n_distinct(user_id[labels_perday > 0]),
    NwithlabelsSTOP = n_distinct(user_id[labels_perday_stops > 0]),
    NwithlabelsMOVE = n_distinct(user_id[labels_perday_moves > 0]),
    Nwithselfmade = n_distinct(user_id[selfmadeperiods > 0]),
    # not everyone with measurements saw stops/trips in their daily overview in the app
    perc_periods = Nwithperiods / Nwithobservations,
    perc_periods_daily = TOTALperioddays / TOTALdiarydays,
    perc_labels_daily = TOTALlabeldays / TOTALperioddays,
    perc_self_daily = TOTALdayswselfmade / TOTALdiarydays_phase2,
    perc_labeling = Nwithlabels / Nwithobservations,
    perc_labelingSTOP = NwithlabelsSTOP / Nwithobservations,
    perc_labelingMOVE = NwithlabelsMOVE / Nwithobservations,
    # only users who get to see stops/trips in their daily overview
    perc_labeling2 = Nwithlabels / Nwithperiods,
    perc_useradded = Nwithselfmade / N_fase2,
    perc_useradded2 = Nwithselfmade / N_fase2periods,
    Timeinapp = mean(Apptime_perday_sec, na.rm = TRUE)
  )
# share of users with observations but without any period
1 - compleetweek$perc_periods
compleetweek$Timeinapp
#first full study day 

# TABLE 5 ACTIVITY OF USERS WITH ANY DATA ON FIRST FULL DAY-----------
# The point estimates and 95% CIs reported in the table come from the bootstrap
# further below; this summary gives the plain sample values per study length.
#day2=OverviewDays_daily23 %>%
compleetdag <- OverviewDays_STAGE2dag2 %>% #OverviewDays_STAGE2 %>%
  filter(Studiedag == 2) %>% # study day 2 is the first full day
  group_by(studylength) %>% # phase or study length
  summarise(
    N_participants = n_distinct(user_id),
    N_fase2 = n_distinct(user_id[fase == 2 & n_observations > 0]),
    N_fase2periods = n_distinct(user_id[fase == 2 & periods_perday_withDels > 0]),
    TOTALhoursdata = sum(n_hours, na.rm = TRUE),
    TOTALminutesdata = sum(n_minutes, na.rm = TRUE),
    #TOTALdiarydays = sum(TOT_diarydays, na.rm=T),
    TOTALdayswselfmade = sum(TOT_dayswselfmade, na.rm = TRUE),
    TOTALdayswlabels = sum(TOT_dayswlabels, na.rm = TRUE),
    Nwithobservations = n_distinct(user_id[n_observations > 0]),
    Nwithdels = n_distinct(user_id[perc_dels_daily > 0]),
    Nwithperiods = n_distinct(user_id[periods_perday_withDels > 0]),
    Nwithlabels = n_distinct(user_id[labels_perday > 0]),
    Nwithselfmade = n_distinct(user_id[selfmadeperiods > 0]),
    MEANhoursdiaryday = TOTALhoursdata / Nwithobservations,
    MEANminutesdiaryday = TOTALminutesdata / Nwithobservations,
    sdhoursdiaryday = sd(n_hours, na.rm = TRUE),
    sdminutesdiaryday = sd(n_minutes, na.rm = TRUE),
    # not everyone with measurements saw stops/trips in their daily overview in the app
    perc_periods = Nwithperiods / Nwithobservations,
    perc_labeling = Nwithlabels / Nwithperiods,
    perc_useradded = Nwithselfmade / N_fase2periods
  )




#datasets to use
# een / zeven: first full day (study day 2) of the 1-day and 7-day groups;
# lang: all days after the first study day of the 7-day group, for users with
# location measurements on study day 2.
een <- OverviewDays_STAGE2dag2 %>% filter(studylength == 1)
zeven <- OverviewDays_STAGE2dag2 %>% filter(studylength == 7)
lang <- OverviewDays_STAGE2 %>%
  filter(studylength == 7, Studiedag > 1, user_id %in% idsstage2day2)

#descriptive statistics
#hours
hist(een$n_hours)
hist(zeven$n_hours)
hist(lang$n_hours)
#minutes
hist(een$n_minutes)
hist(zeven$n_minutes)
hist(lang$n_minutes)

# Each figure below is a ratio of two shares (share of rows meeting the
# numerator condition divided by share meeting the denominator condition).
#% with periods
# (the 1-day line previously took the mean of an element-wise ratio, which is
# NaN/Inf for rows without minutes; it is now a ratio of means like the others)
mean(een$periods_perday_withDels > 0) / mean(een$n_minutes > 0)
mean(zeven$periods_perday_withDels > 0) / mean(zeven$n_minutes > 0)
mean(lang$periods_perday_withDels > 0) / mean(lang$n_minutes > 0)
#labels %
mean(een$labels_perday > 0) / mean(een$periods_perday_withDels > 0)
mean(zeven$labels_perday > 0) / mean(zeven$periods_perday_withDels > 0)
mean(lang$labels_perday > 0) / mean(lang$periods_perday_withDels > 0)
# % days with manually added stops/tracks
# NOTE(review): only the 1-day numerator is restricted to fase 2; confirm that
# manually added periods cannot occur outside fase 2 for the other two groups.
mean(een$selfmadeperiods > 0 & een$fase == 2) / mean(een$fase == 2)
mean(zeven$selfmadeperiods > 0) / mean(zeven$fase == 2)
mean(lang$selfmadeperiods > 0) / mean(lang$fase == 2)

#set up bootstrap
R <- 1000 # number of bootstrap replicates
#days 95% CI with bootstrap
# use values 25 and 975 (the 2.5% and 97.5% quantiles of the replicates)

# Nonparametric bootstrap of one statistic.
#   data      data frame; its rows are resampled with replacement
#   statistic function taking a resampled data frame and returning one number
#   n_rep     number of replicates (defaults to the global R)
#   seed      RNG seed, reset on every call so each estimate is reproducible
# Returns a numeric vector with one value of the statistic per replicate.
bootstrap_stat <- function(data, statistic, n_rep = R, seed = 2022) {
  set.seed(seed)
  n_rows <- nrow(data)
  estimates <- numeric(n_rep) # preallocated instead of grown with c()
  for (i in seq_len(n_rep)) {
    resample <- data[sample(seq_len(n_rows), n_rows, replace = TRUE), ]
    estimates[i] <- statistic(resample)
  }
  estimates
}

# Number of distinct users among the rows selected by the logical vector `keep`
n_users <- function(d, keep) length(unique(d$user_id[keep]))

# Mean hours / minutes of data per row, each value capped at a full day.
# The minutes version used to compute the capped values and then average the
# uncapped column; it now averages the capped values, like the hours version.
stat_mean_hours <- function(d) mean(pmin(d$n_hours, 24), na.rm = TRUE)
stat_mean_minutes <- function(d) mean(pmin(d$n_minutes, 24 * 60), na.rm = TRUE)

# User-level proportions (first full day)
# users with periods / users with observations
stat_users_periods <- function(d) {
  n_users(d, d$periods_perday_withDels > 0) / n_users(d, d$n_observations > 0)
}
# users with labels / users with periods
stat_users_labels <- function(d) {
  n_users(d, d$labels_perday > 0) / n_users(d, d$periods_perday_withDels > 0)
}
# users with self-made periods / fase-2 users with observations
stat_users_selfmade <- function(d) {
  n_users(d, d$selfmadeperiods > 0) /
    n_users(d, d$fase == 2 & d$n_observations > 0)
}

# Day-level proportions (several days per user)
# days with periods / days with location data
stat_days_periods <- function(d) {
  sum(d$periods_perday_withDels >= 1) / sum(d$n_minutes >= 1)
}
# days with labels / days with periods
stat_days_labels <- function(d) {
  sum(d$labels_perday >= 1, na.rm = TRUE) /
    sum(d$periods_perday_withDels >= 1, na.rm = TRUE)
}
# days with self-made periods / fase-2 days
stat_days_selfmade <- function(d) {
  sum(d$selfmade, na.rm = TRUE) / sum(d$fase == 2, na.rm = TRUE)
}

# For the user-level proportions below the bootstrapped mean is printed as a
# proportion and the CI as a percentage (unchanged from the original output).

##table 5: first full day of 1-day group----------
een <- OverviewDays_STAGE2dag2 %>% filter(studylength == 1)

#1 day group, n_hours
Proportion_boot <- bootstrap_stat(een, stat_mean_hours)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) # Bootstrapped CI

#1 day group, n_minutes
Proportion_boot <- bootstrap_stat(een, stat_mean_minutes)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) # Bootstrapped CI

#1 day group, percentage of users with compiled periods
Proportion_boot <- bootstrap_stat(een, stat_users_periods)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI

#1 day group, labeling percentage
Proportion_boot <- bootstrap_stat(een, stat_users_labels)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI

#1 day group, percentage of users who self added stops or trips
Proportion_boot <- bootstrap_stat(een, stat_users_selfmade)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI


## table 5: 7-day group: first full day---------
zeven <- OverviewDays_STAGE2dag2 %>% filter(studylength == 7)

#7 day group, n_hours
Proportion_boot <- bootstrap_stat(zeven, stat_mean_hours)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) # Bootstrapped CI

#7 day group, n_minutes
Proportion_boot <- bootstrap_stat(zeven, stat_mean_minutes)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) # Bootstrapped CI

#7 day group percentage compiled periods
Proportion_boot <- bootstrap_stat(zeven, stat_users_periods)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI

#7 day group, first day, labeling percentage
Proportion_boot <- bootstrap_stat(zeven, stat_users_labels)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI

#7 day group, first day, percentage of users who self added stops or trips
Proportion_boot <- bootstrap_stat(zeven, stat_users_selfmade)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI


##table 5: 7-day group first full six days------
lang <- OverviewDays_STAGE2 %>%
  filter(studylength == 7, Studiedag > 1, user_id %in% idsstage2day2)

#week long, n_hours
Proportion_boot <- bootstrap_stat(lang, stat_mean_hours)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) # Bootstrapped CI
hist(Proportion_boot)

#week long, n_minutes
Proportion_boot <- bootstrap_stat(lang, stat_mean_minutes)
mean(Proportion_boot) # Bootstrapped mean
quantile(Proportion_boot, probs = c(0.025, 0.975)) # Bootstrapped CI

#percentage of days with compiled diaries
Proportion_boot <- bootstrap_stat(lang, stat_days_periods)
mean(Proportion_boot) * 100 # Bootstrapped mean %
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI %

#percentage of diary days with any labels
Proportion_boot <- bootstrap_stat(lang, stat_days_labels)
mean(Proportion_boot) * 100 # Bootstrapped mean %
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI %

#percentage of diary days with any manually added trips or stops by the user
Proportion_boot <- bootstrap_stat(lang, stat_days_selfmade)
mean(Proportion_boot) * 100 # Bootstrapped mean %
quantile(Proportion_boot, probs = c(0.025, 0.975)) * 100 # Bootstrapped CI %

#END OF SCRIPT------









